***********************************************************************************
* Merging NEW-OWES-ONET-SOC-CEN2010-IPUMS with CPS panel data 1-year ASEC linkage
* last modified 7/22/2025
*
* Note: Need to aggregate SOC in NEW-OWES-ONET-SOC-CEN2010-IPUMS into CEN2010 in CPS
*
***********************************************************************************


* Session setup: close any log left open by an earlier run, empty the data in
* memory, and turn off output paging so the do-file runs without pausing.
capture log close
clear
set more off

***************
* Directories *
***************

* Project root. Every path global below is written relative to this directory.
cd "/Users/"

* Path globals (referenced later as ${name}).
global dofile          "./Code/NEW-OEWS"
global matrix_data     "./Data/National Employment Matrix"
global oews_data       "./Data/OEWS/all_excel"
global onet_data       "./OccLink/ONET/ONET data/merged"
global cps_linkeddata  "./Data/CPS/IPUMS/00016-1yearlinked"
global xwalk           "./Data/Crosswalk"
global results         "./Results"
global temp            "./Temp"
* Note: this value already ends in "/", so "${logfile}/name.log" contains a
* doubled separator.
global logfile         "./Results/"


********************************
* Clean CPS data               *
********************************

* Load the linked CPS extract and derive the person-level covariates used later.
* Variables carrying the _2 suffix are recoded into analysis variables; the
* result is saved to ${temp} for the merge step.
use "${cps_linkeddata}/cps_00016.dta", clear

** generate a unique person id **
* clonevar keeps the storage type of cpsidv_2. A plain -gen- would create a
* float (about 7 significant digits), which cannot hold a long identifier
* exactly and would make distinct persons share the same personid.
clonevar personid = cpsidv_2

** generate an immigrant variable **
recode citizen_2 (1 2 3 = 1 "US born") (4 = 2 "Naturalized") (5 = 3 "non-citizen"), gen(immigrant)

** generate a marital status variable **
recode marst_2 (1 2 = 1 "married") (3 4 5 = 2 "separated/divorced/widowed") (6 = 3 "single/never married"), gen(marital)

** generate a race variable **
recode hispan_2 (0 901 902 = 0 "Not") (100/612 = 1 "Hispanic"), gen(hispanic)
rename race_2 raceX
* -else- also catches missing values, so a missing raceX is coded 3 as well.
recode raceX (100 = 1 "White") (200 = 2 "Black") (else = 3 "Asians and Others"), gen(race)

** generate employment status variable
* Codes not listed in any rule are copied to empstatus unchanged and unlabeled.
recode empstat_2 (0=0 "NIU, ages 0-14") (1/12=1 "employed") (21/22=2 "unemployed") (30/36=3 "Not in labor force"), gen(empstatus)

** generate education group variable
recode educ_2 (2/72 = 1 "Below HS") (73 = 2 "HS") (80/110 = 3 "Some college") (111/125 = 4 "BA or above") (else = .), gen(edu_gr)

** generate age group variable
* The middle group covers ages 36 through 55; its label now says so (it read "36-56").
recode age_2 (15/35 = 1 "15-35") (36/55 = 2 "36-55") (56/85= 3 "56+"), gen(age_gr)

** generate sex variables
recode sex_2 (1=1 "male") (2=0 "female"), gen(male)

** generate a weight variable **
* NOTE(review): asecwth_2 looks like a household-level ASEC weight; confirm that
* a household rather than a person weight is intended for person-level models.
clonevar weight = asecwth_2

** generate occupation variables **
* occ_t1 comes from occ2010_2 and occ_t0 from occ10ly_2; both are later matched
* against the occupation-level panels.
rename occ2010_2 occ_t1
rename occ10ly_2 occ_t0

save "${temp}/cps_2000-2020_temp_panel.dta", replace


********************************
* Merge OCC data with CPS data *
********************************
/* prepare origin (t0) and destination (t1) occ data */
* The same occupation-level file is saved twice: once keyed on occ_t0 with every
* measure suffixed _t0 (origin_occ_panel.dta) and once keyed on occ_t1 with the
* suffix _t1 (destination_occ_panel.dta). Both copies use matchyear as the year key.
local occ_measures empsize projsize proj_pc_growth annual_mean annual_pct10 annual_pct90 ///
	emp_chng_rate emp_pc_growth outlook outlook_gr projjobopen

foreach side in origin destination {

	* origin -> t0, destination -> t1
	local sfx = cond("`side'" == "origin", "t0", "t1")

	use "${temp}/oews_matrix_ooh2000-2020_ipums_occ.dta", clear

	rename year matchyear
	rename ipums2010 occ_`sfx'

	foreach var of varlist `occ_measures' {
		rename `var' `var'_`sfx'
	}

	save "${temp}/`side'_occ_panel.dta", replace
}


/* merge occ with CPS */
* Attach origin (t0) and destination (t1) occupation measures to each CPS record.
use "${temp}/cps_2000-2020_temp_panel.dta", clear

* generate match year
* matchyear is missing for every year_2 outside this list, so those records get
* no occupation measures (they stay in the data as unmatched master rows).
* NOTE(review): 2019 is listed here but is not in the year filter applied to the
* analysis sample further down; confirm whether it belongs in this list.
gen 	 matchyear = year_2  if inlist(year_2, 2000, 2002, 2004, 2006, 2008, 2010, 2012, 2014, 2016, 2018, 2019, 2020)


* keep(1 3): keep all CPS records (matched or not) and discard occupation rows
* that match no CPS record.
merge m:1 matchyear occ_t0 using "${temp}/origin_occ_panel.dta", keep(1 3) nogenerate
merge m:1 matchyear occ_t1 using "${temp}/destination_occ_panel.dta", keep(1 3) nogenerate


/* adjust inflation, in 2020 dollars january to january */
* Creates adj_<wage>_<t0|t1> = <wage>_<t0|t1> * factor(year_2) for the three wage
* measures. One factor per calendar year 2000..2020, in order; records with
* year_2 outside 2000-2020 keep a missing adjusted value.
* NOTE(review): the factors are hard-coded; their source is not recorded in this file.
local to2020 1.53 1.47 1.46 1.42 1.39 1.35 1.30 1.27 1.22 1.22 1.19 ///
	1.17 1.14 1.12 1.10 1.10 1.09 1.06 1.04 1.02 1.00

* -foreach ... in- iterates over plain words. The stems and suffixes below are
* name fragments, not variables, so no varlist/newlist checking is wanted.
foreach var in annual_mean annual_pct10 annual_pct90 {
	foreach time in t0 t1 {

		gen adj_`var'_`time' = .

		forvalues yr = 2000/2020 {
			* position of this year's factor in the list (2000 -> 1, ..., 2020 -> 21)
			local pos = `yr' - 1999
			local factor : word `pos' of `to2020'
			replace adj_`var'_`time' = `var'_`time' * `factor' if year_2 == `yr'
		}
	}
}



* Occupation-level measures retained for both the origin (t0) and the
* destination (t1) occupation.
local occ_keep
foreach sfx in t0 t1 {
	local occ_keep `occ_keep' occ_`sfx' empsize_`sfx' projsize_`sfx' proj_pc_growth_`sfx' ///
		adj_annual_mean_`sfx' adj_annual_pct10_`sfx' adj_annual_pct90_`sfx' ///
		emp_chng_rate_`sfx' emp_pc_growth_`sfx' outlook_`sfx' outlook_gr_`sfx' ///
		projjobopen_`sfx'
}

* Person-level identifiers, covariates, weight and income variables, plus the
* occupation measures collected above. Everything else is dropped here.
keep year_2 personid age_2 sex_2 male race hispanic immigrant marital ///
	 empstatus edu_gr age_gr weight ///
	 inclongj_1 inclongj_2 incwage_1 incwage_2 ///
	 `occ_keep'



********************************
* Mark Sample                  *
********************************

/* restrict to ages 16-65 */
drop if !inrange(age_2, 16, 65)

/* restrict to years with available OOH data */
drop if !inlist(year_2, 2000, 2002, 2004, 2006, 2008, 2010, 2012, 2014, 2016, 2018, 2020)


save "${temp}/cps_2000-2020_wide_panel.dta", replace

***********************************************************
* Reshape the data for discrete choice models for movers  *
***********************************************************

* Build the movers file: records whose origin and destination occupation codes differ.
use "${temp}/cps_2000-2020_wide_panel.dta", clear

/* Keep movers */
keep if occ_t0 != occ_t1

/* Keep civilian, employed population */
* Drops records whose origin or destination occupation code is 9800 or higher.
* Stata treats missing as larger than any number, so records with a missing
* occupation code are dropped by this line as well.
drop if occ_t0 >= 9800 | occ_t1 >= 9800

save "${temp}/cps_2000-2020_wide_movers_panel.dta", replace




log using "${logfile}/TableF8.log", replace


********************************************************************************************
* Appendix Table logit Model for Upward Mobility Based on 5% Change in Individual Earnings *
********************************************************************************************

use "${temp}/cps_2000-2020_wide_movers_panel.dta", clear

* Define the mobility variable

* Set the codes 99999999 and 99999998 and zero earnings to missing, so they
* never enter the earnings ratio.
recode incwage_1 (99999999 = .) (99999998 = .) (0 = .)
recode incwage_2 (99999999 = .) (99999998 = .) (0 = .)


* Earnings ratio incwage_2 / incwage_1. Stored as double: the thresholds below
* are double literals, and a float ratio of exactly 0.95 would be held as
* 0.94999999 and fall on the wrong side of the 0.95 cut-off.
gen double xx = incwage_2/incwage_1

* A record is classified only when both earnings are non-missing and the
* occupation codes differ.
local classifiable !missing(incwage_1) & !missing(incwage_2) & occ_t0 != occ_t1

gen mobile = 1 if xx > 1.05 & `classifiable' /*upward*/

replace mobile = 2 if inrange(xx, 0.95, 1.05) & `classifiable' /*horizontal*/

replace mobile = 0 if xx < 0.95 & `classifiable' /*downward mobility*/


lab def mobile 0 "downward" 1 "upward" 2 "horizontal" 
lab val mobile mobile

* Binary outcomes derived from mobile; both stay missing where mobile is missing.
recode mobile (0=1 "downward") (1 2 = 0 "not downward"), gen(downward)
recode mobile (1=1 "upward") (0 2 = 0 "not upward"), gen(upward)

* Rescale the growth variables so that one unit equals a 10 percent change.
* Produces emp_pc_growth10_t0/_t1 and proj_pc_growth10_t0/_t1.

foreach stem in emp_pc_growth proj_pc_growth {
	foreach sfx in t0 t1 {
		gen `stem'10_`sfx' = `stem'_`sfx'/10
	}
}


* Flag the analytical sample: good = 1 when all seven occupation measures
* listed below are different from system missing, 0 otherwise.

mark good if emp_chng_rate_t0 != . & emp_chng_rate_t1 != . ///
	& proj_pc_growth_t0 != . & proj_pc_growth_t1 != .      ///
	& outlook_gr_t0 != . & outlook_gr_t1 != .              ///
	& adj_annual_mean_t0 != .


* Logit models of upward mobility (upward = 1) on the analytical sample (good == 1).
* All four models share the same individual controls and differ only in the
* occupation-level predictors.

* Individual controls. Year enters through year_2, named explicitly: the
* original spelling "year" only resolved through variable-name abbreviation.
* ib(first) takes the smallest year as the reference category; the original
* "ib1." asked for base value 1, which is not a calendar-year value.
local controls i.male ib2.age_gr ib1.race i.hispanic ib1.edu_gr ib(first).year_2


* m1: individual variables + 2-year employment change

* NOTE(review): lograte_t0/_t1 are created here but are not used by any model below.
gen lograte_t0 = log(1+emp_chng_rate_t0)
gen lograte_t1 = log(1+emp_chng_rate_t1)

/* model 1 */
logit upward emp_pc_growth10_t0 emp_pc_growth10_t1 `controls' if good == 1


* m2: individual variables + 10-year projected growth

* NOTE(review): logprojrate_t0/_t1 are created here but are not used by any model below.
gen logprojrate_t0 = log(1+proj_pc_growth_t0/100)
gen logprojrate_t1 = log(1+proj_pc_growth_t1/100)

/* model 2 */
logit upward proj_pc_growth10_t0 proj_pc_growth10_t1 `controls' if good == 1

* m3: individual variables + projected OOH outlook categories

/* model 3 */
logit upward ib2.outlook_gr_t0 ib2.outlook_gr_t1 `controls' if good == 1

/* model 4: model 3 plus the origin-by-destination outlook interaction */
logit upward ib2.outlook_gr_t0##ib2.outlook_gr_t1 `controls' if good == 1


log close
